Data mining Project - University of Pisa, acedemic year 2023/24
Authors: Giacomo Aru, Giulia Ghisolfi, Luca Marini, Irene Testa
We import the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as mplt
from sklearn.decomposition import PCA
import plotly.express as px
import warnings
import json
np.warnings = warnings # altrimenti numpy da problemi con pyclustering, TODO: è un problema solo mio?
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score, silhouette_samples, adjusted_rand_score
from sklearn.metrics import homogeneity_score, completeness_score, normalized_mutual_info_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.cluster import KMeans, BisectingKMeans
from scipy.spatial.distance import pdist, squareform
from pyclustering.cluster.xmeans import xmeans, splitting_type
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, InterclusterDistance
import utm
import os
import sys
sys.path.append(os.path.abspath('..'))
from plot_utils import *
from clustering_utils import *
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)
We load the data:
incidents_df = pd.read_csv(
'../data/incidents_indicators.csv',
index_col=0
)
f = open('../data/indicators_names.json')
features_to_cluster = json.loads(f.read())
categorical_features = [
'year', 'month', 'day_of_week', 'party', #'state', 'address_type', 'county', 'city'
'firearm', 'air_gun', 'shots', 'aggression', 'suicide',
'injuries', 'death', 'road', 'illegal_holding', 'house',
'school', 'children', 'drugs', 'officers', 'organized', 'social_reasons',
'defensive', 'workplace', 'abduction', 'unintentional'
# 'incident_characteristics1', 'incident_characteristics2'
]
# other interesting features:
# poverty_perc, date
incidents_df = incidents_df.dropna(subset=features_to_cluster)
We select the features to cluster:
features_to_cluster_no_coord = features_to_cluster[2:]
features_to_cluster_no_coord
['location_imp', 'surprisal_address_type', 'age_range', 'avg_age', 'surprisal_min_age', 'n_child_prop', 'n_teen_prop', 'surprisal_age_groups', 'n_killed_prop', 'n_injured_prop', 'n_unharmed_prop', 'n_males_prop', 'surprisal_n_males', 'surprisal_characteristics', 'n_arrested_prop', 'n_participants', 'surprisal_day']
incidents_df[features_to_cluster_no_coord].describe()
| location_imp | surprisal_address_type | age_range | avg_age | surprisal_min_age | n_child_prop | n_teen_prop | surprisal_age_groups | n_killed_prop | n_injured_prop | n_unharmed_prop | n_males_prop | surprisal_n_males | surprisal_characteristics | n_arrested_prop | n_participants | surprisal_day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.318110e+05 | 131811.000000 | 131811.000000 | 131811.000000 | 131811.000000 | 131811.000000 | 131811.000000 | 131811.000000 | 131811.000000 | 131811.000000 | 131811.000000 | 131811.00000 | 131811.000000 | 131811.000000 | 131811.000000 | 131811.000000 | 131811.000000 |
| mean | 1.360195e-02 | 1.405525 | 3.456138 | 30.200264 | 4.901767 | 0.011723 | 0.075311 | 2.339911 | 0.220584 | 0.300006 | 0.158516 | 0.87350 | 1.854277 | 3.773041 | 0.321602 | 1.817299 | 5.716236 |
| std | 4.479459e-02 | 1.469670 | 8.140794 | 12.340120 | 1.097396 | 0.096645 | 0.244843 | 1.574419 | 0.360213 | 0.394918 | 0.295476 | 0.25724 | 1.275196 | 1.667965 | 0.409666 | 1.107308 | 1.153584 |
| min | 1.000000e-07 | -0.000000 | 0.000000 | 0.000000 | -0.000000 | 0.000000 | 0.000000 | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | -0.000000 | -0.000000 | 0.000000 | 1.000000 | -0.000000 |
| 25% | 1.000000e-07 | 0.398084 | 0.000000 | 21.000000 | 4.169925 | 0.000000 | 0.000000 | 1.222392 | 0.000000 | 0.000000 | 0.000000 | 1.00000 | 0.947533 | 2.465123 | 0.000000 | 1.000000 | 4.963474 |
| 50% | 1.000000e-05 | 0.681367 | 0.000000 | 27.000000 | 4.807355 | 0.000000 | 0.000000 | 1.807355 | 0.000000 | 0.000000 | 0.000000 | 1.00000 | 1.392317 | 3.668379 | 0.000000 | 2.000000 | 5.754888 |
| 75% | 1.000000e-05 | 2.152003 | 2.000000 | 36.000000 | 5.554589 | 0.000000 | 0.000000 | 2.982977 | 0.500000 | 0.500000 | 0.250000 | 1.00000 | 2.321928 | 5.000000 | 0.600000 | 2.000000 | 6.491853 |
| max | 6.467426e-01 | 9.489848 | 82.000000 | 101.000000 | 9.614710 | 1.000000 | 1.000000 | 9.614710 | 1.000000 | 1.000000 | 1.000000 | 1.00000 | 9.489848 | 9.614710 | 1.000000 | 63.000000 | 9.614710 |
We display the distribution of the selected features:
fig, ax = plt.subplots(figsize=(15, 5))
sns.violinplot(data=incidents_df[features_to_cluster_no_coord],ax=ax)
plt.xticks(rotation=90, ha='right');
In order to obtain meaningful results, we must ensure that no feature presents too high magnitude that could overshadow the contributions of others. To implement this we normalize all features between 0 and 1.
from sklearn.preprocessing import MinMaxScaler
scaler_obj = MinMaxScaler()
normalized_indicators = pd.DataFrame(data=scaler_obj.fit_transform(incidents_df[features_to_cluster_no_coord].values), columns=features_to_cluster_no_coord)
We plot the features distribution after the normalization:
fig, ax = plt.subplots(figsize=(15, 5))
sns.violinplot(data=normalized_indicators,ax=ax)
plt.xticks(rotation=90, ha='right');
We calculate the principal component decomposition of the indicators chosen for clustering:
pca = PCA()
X_pca = pca.fit_transform(normalized_indicators)
pca_df = pd.DataFrame(index=incidents_df.index)
We visualize the distribution of the features in the space defined by the first two principal components:
nrows=4
ncols=5
row=0
fig, axs = mplt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 20), sharex=True, sharey=True)
for i, col in enumerate(normalized_indicators.columns):
if i != 0 and i % ncols == 0:
row += 1
axs[row][i % ncols].scatter(X_pca[:, 0], X_pca[:, 1], edgecolor='k', s=40, c=normalized_indicators[col], cmap='viridis')
axs[row][i % ncols].set_title(col)
axs[row][i % ncols].set_xlabel("1st eigenvector")
axs[row][i % ncols].set_ylabel("2nd eigenvector")
We observe that:
We visualize the distribution of the features in the space defined by the third and fourth principal components:
nrows=4
ncols=5
row=0
fig, axs = mplt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 20), sharex=True, sharey=True)
for i, col in enumerate(normalized_indicators.columns):
if i != 0 and i % ncols == 0:
row += 1
axs[row][i % ncols].scatter(X_pca[:, 2], X_pca[:, 3], edgecolor='k', s=40, c=normalized_indicators[col], cmap='viridis')
axs[row][i % ncols].set_title(col)
axs[row][i % ncols].set_xlabel("3rd eigenvector")
axs[row][i % ncols].set_ylabel("4th eigenvector")
We observe that:
We display incidents in vector space of the first 3 PC:
x = X_pca[:, 0]
y = X_pca[:, 2]
z = X_pca[:, 1]
fig = px.scatter_3d(x=x, y=y, z=z, labels={'x': '1st eigenvector', 'y': '3rd eigenvector', 'z': '2nd eigenvector'})
fig.show()
To narrow the number of PC down to the most relevant ones we plot the explained variance of each component, and relate it to the previous one:
exp_var_pca = pca.explained_variance_ratio_
diff_var = []
for i, var in enumerate(exp_var_pca[:-1]):
diff_var.append( var-exp_var_pca[i+1])
xtick = []
gap = 0
for i, var in enumerate(diff_var):
xtick.append(i+gap)
if i != 0 and diff_var[i-1] <= var:
gap += 0.5
if gap == 0.5:
plt.axvline(x = i+gap+0.25, color = 'green', linestyle = '-.', alpha=0.5, label='possible cut')
else:
plt.axvline(x = i+gap+0.25, color = 'green', linestyle = '-.', alpha=0.5)
print(xtick)
xtick.append(xtick[-1]+1.5)
plt.bar(xtick, exp_var_pca, align='center')
plt.plot(xtick[1:], diff_var, label='difference from prevoius variance', color='orange')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component')
plt.title('Explained variance by principal component')
plt.xticks(xtick, range(exp_var_pca.shape[0]))
plt.legend();
[0, 1, 2.5, 3.5, 4.5, 6.0, 7.0, 8.0, 9.5, 10.5, 12.0, 13.0, 14.5, 15.5, 16.5, 18.0]
The most significant PC are the third, fourth and fifth
def get_reconstruction_error(x_pca, x_orig, pca, n_comp):
dummy = np.matmul(x_pca[:,:n_comp], pca.components_[:n_comp,:]) + pca.mean_
return pd.DataFrame(index=x_orig.index, data=np.sum((dummy - x_orig.values)**2, axis=1))
pca_col = [
'1st_comp',
'2nd_comp',
'3rd_comp',
'4th_comp',
'5th_comp',
'6th_comp',
'7th_comp',
'8th_comp',
'9th_comp',
'10th_comp',
'11th_comp',
'12th_comp',
'13th_comp',
'14th_comp',
'15th_comp',
'16th_comp',
'17th_comp'
]
pca_indicators = pd.DataFrame(index=normalized_indicators.index, data=X_pca, columns=pca_col)
We calculate the dataset reconstruction error for the first 8, 10 and 12 components:
pca_indicators['PCA_rec_error_8C'] = get_reconstruction_error(X_pca, normalized_indicators, pca, 8)
pca_indicators['PCA_rec_error_10C'] = get_reconstruction_error(X_pca, normalized_indicators, pca, 10)
pca_indicators['PCA_rec_error_12C'] = get_reconstruction_error(X_pca, normalized_indicators, pca, 12)
We display the distribution of the principal components and of the reconstruction error:
fig, ax = plt.subplots(figsize=(15, 5))
sns.violinplot(data=pca_indicators,ax=ax)
plt.xticks(rotation=90, ha='right');
We normalize the principal components to apply the clustering algorithm on them:
pca_normalized_indicators = pd.DataFrame(data=scaler_obj.fit_transform(pca_indicators.values), columns=pca_indicators.columns)
We display the normalized distributions:
fig, ax = plt.subplots(figsize=(15, 5))
sns.violinplot(data=pca_normalized_indicators,ax=ax)
plt.xticks(rotation=90, ha='right');
We will use the first 10 components, striking a balance between minimizing the number of components and mitigating the reconstruction error. Indeed, as shown below, the reconstruction error generated by these 10 components is quite uniform and close to zero:
hist_box_plot(
pca_normalized_indicators,
'PCA_rec_error_10C',
title='PCA_rec_error_10C',
bins=int(np.log(pca_normalized_indicators.shape[0])), # Sturger's rule
figsize=(10, 5)
)
clustered_components = pca_col[:-7]
X = pca_normalized_indicators[clustered_components].values
Below we define the parameters of the k-means algorithm:
MAX_ITER = 300
N_INIT = 10
INIT_METHOD = 'k-means++'
MAX_K = 30
RANDOM_STATE = 7
The following function uses the implementation of the elbow method from the library Yellowbrick, to identify the best value of k. This method consists in computing a metric to evaluate the quality of the clustering for each value of k, and then plotting the metric as a function of k. The best value of k is the one that corresponds to the point of inflection of the curve (the point where the metric starts to decrease more slowly).
def apply_k_elbow_method(X, kmeans_params, metric, start_k, max_k, plot_elbow=True):
if metric == 'SSE':
metric = 'distortion'
metric_descr = 'SSE'
elif metric == 'calinski_harabasz':
metric_descr = 'Calinski Harabasz Score'
elif metric == 'silhouette':
metric_descr = 'Silhouette Score'
else:
raise ValueError('Metric not supported')
_, axs = plt.subplots(nrows=1, ncols=len(max_k) if len(max_k)!= 1 else 2, figsize=(30,5))
for i in range(len(max_k)):
kmeans_params['n_clusters'] = i
kmeans = KMeans(**kmeans_params)
elbow_vis = KElbowVisualizer(kmeans, k=(start_k, max_k[i]), metric=metric, timings=False, ax=axs[i], locate_elbow=plot_elbow)
elbow_vis.fit(X)
axs[i].set_title(f'{metric_descr} elbow for K-Means clustering (K in [{str(start_k)}, {str(max_k[i] - 1)}])')
axs[i].set_ylabel(metric_descr)
axs[i].set_xlabel('K')
if plot_elbow:
axs[i].legend([
f'{metric_descr}',
f'elbow at K = {str(elbow_vis.elbow_value_)}, {metric_descr} = {elbow_vis.elbow_score_:0.2f}'
])
else:
axs[i].legend([
f'{metric_descr}'
])
if len(max_k)==1:
axs[1].remove()
plt.show()
Now we apply the elbow method evaluating the SSE and computing the elbow for each value of k between 1 and 10, 1 and 20 and 1 and 30. We consider also k=1 (absence of clusters) and we repeat the analysis with k in different ranges because the inflection point of the curve could vary depending on the number of points the curve is made of.
kmeans_params = {'init': INIT_METHOD, 'n_init': N_INIT, 'max_iter': MAX_ITER, 'random_state': RANDOM_STATE}
apply_k_elbow_method(X=X, kmeans_params=kmeans_params, metric='SSE', start_k=1, max_k=[10, 20, 30])
The elbow method identifies k=4 and k=6 as best possible values of k. For both values of k the elbow is evident.
best_k = [4, 6]
We repeat the analysis using the silhouette score.
apply_k_elbow_method(X=X, kmeans_params=kmeans_params, metric='silhouette', start_k=2, max_k=[15], plot_elbow=False)
The curve generated above is not monotonic. Indeed there are several local maxima (for k equals to 4, 6, 8, 12 and 14).
best_k += [4, 8, 12, 14]
We repeat again the analysing evaluating the Calinski-Harabasz score (the ratio of the sum of between-cluster dispersion and of within-cluster dispersion; the higher it is the better the clustering).
apply_k_elbow_method(X=X, kmeans_params=kmeans_params, metric='calinski_harabasz', start_k=2, max_k=[15], plot_elbow=False)
There is a single global maximum at k=4.
best_k += [4]
best_k = sorted(list(set(best_k)))
best_k
[4, 6, 8, 12, 14]
To identify the best value of k we also apply the X-means algorithm from the library pyclustering, which is a variant of the k-means algorithm that should automatically find the best value of k. The algorithm starts with k=2 and then it iteratively splits the clusters until a score does not improve anymore. The implementation we will use supports both the BIC score (Bayesian Information Criterion) and the Minimum Noiseless Descriptionlength score (MDL):
initial_centers = kmeans_plusplus_initializer(data=X, amount_centers=1, random_state=RANDOM_STATE).initialize()
xmeans_MDL_instance = xmeans(
data=X,
initial_centers=initial_centers,
kmax=MAX_K,
splitting_type=splitting_type.BAYESIAN_INFORMATION_CRITERION,
random_state=RANDOM_STATE
)
xmeans_MDL_instance.process()
n_xmeans_BIC_clusters = len(xmeans_MDL_instance.get_clusters())
print(f'Number of clusters found by xmeans using BIC score and setting the maximum number of clusters to {MAX_K}: {n_xmeans_BIC_clusters}')
Number of clusters found by xmeans using BIC score and setting the maximum number of clusters to 30: 30
xmeans_MDL_instance = xmeans(
data=X,
initial_centers=initial_centers,
kmax=MAX_K,
splitting_type=splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH,
random_state=RANDOM_STATE
)
xmeans_MDL_instance.process()
n_xmeans_MDL_clusters = len(xmeans_MDL_instance.get_clusters())
print(f'Number of clusters found by xmeans using MDL score and setting the maximum number of clusters to {MAX_K}: {n_xmeans_MDL_clusters}')
Number of clusters found by xmeans using MDL score and setting the maximum number of clusters to 30: 30
X-means terminates with k equal to the maximum number of clusters allowed (30 in our case). This means that the score always improved when splitting the clusters.
To choose the best value of k among the ones identified by the elbow method, we will compute other metrics to evaluate the quality of the clustering. The following function fits the k-means algorithm with a given set of parameters and computes the following metrics:
def fit_kmeans(X, params):
print(f"Fitting KMeans with k = {params['n_clusters']}")
kmeans = KMeans(**params)
kmeans.fit(X)
results = {}
results['model'] = kmeans
results['SSE'] = kmeans.inertia_
results['BSS'] = compute_bss_per_cluster(X=X, clusters=kmeans.labels_, centroids=kmeans.cluster_centers_, weighted=True).sum()
results['davies_bouldin_score'] = davies_bouldin_score(X=X, labels=kmeans.labels_)
results['calinski_harabasz_score'] = calinski_harabasz_score(X=X, labels=kmeans.labels_)
results['silhouette_score'] = silhouette_score(X=X, labels=kmeans.labels_)
results['n_iter'] = kmeans.n_iter_
return results
To study the effect of the centroids initialization on the results, we will apply the algorithm using the k-means++ initialization repeated 10 times (as previously done)
results = {}
kmeans_params = {}
kmeans_params['random_state'] = RANDOM_STATE
kmeans_params['max_iter'] = MAX_ITER
best_k = sorted(best_k)
for k in best_k:
kmeans_params['n_init'] = N_INIT
kmeans_params['n_clusters'] = k
kmeans_params['init'] = INIT_METHOD
result = fit_kmeans(X=X, params=kmeans_params)
results[str(k)+'_means'] = result
Fitting KMeans with k = 4 Fitting KMeans with k = 6 Fitting KMeans with k = 8 Fitting KMeans with k = 12 Fitting KMeans with k = 14
results_df = pd.DataFrame(results).T
results_df.drop(columns=['model'])
| SSE | BSS | davies_bouldin_score | calinski_harabasz_score | silhouette_score | n_iter | |
|---|---|---|---|---|---|---|
| 4_means | 18305.348439 | 19658.272828 | 1.20318 | 47182.621765 | 0.323521 | 4 |
| 6_means | 15456.863569 | 22506.989109 | 1.381129 | 38384.094779 | 0.297882 | 23 |
| 8_means | 13852.767292 | 24111.127348 | 1.389156 | 32771.883493 | 0.304901 | 11 |
| 12_means | 11344.086077 | 26618.260214 | 1.409811 | 28115.676488 | 0.304028 | 13 |
| 14_means | 10602.07617 | 27362.344516 | 1.438507 | 26164.385843 | 0.308434 | 21 |
We observe that:
best_k = [4, 14]
best_k
[4, 14]
We visualize the size of the clusters with the best values of k:
fig, axs = plt.subplots(nrows=1, ncols=len(best_k), figsize=(25,5))
for i in range(len(best_k)):
k = best_k[i]
plot_clusters_size(clusters=results[f'{k}_means']['model'].labels_, ax=axs[i], title=f'{best_k[i]}-Means clusters size', color_palette=sns.color_palette('tab10'))
We visualize the silhouette score for each point with the best values of k:
fig, axs = plt.subplots(nrows=1, ncols=len(best_k), figsize=(30,10), sharey=True)
for i in range(len(best_k)):
clusters = results[f'{best_k[i]}_means']['model'].labels_
silhouette_per_point = silhouette_samples(X=X, labels=clusters)
results[f'{best_k[i]}_means']['silhouette_per_point'] = silhouette_per_point
plot_scores_per_point(
score_per_point=silhouette_per_point,
clusters=clusters,
score_name='Silhouette score', ax=axs[i],
title=f'Silhouette score for {best_k[i]}-Means clustering',
color_palette=sns.color_palette('tab20'),
minx=-0.18
)
For both values of k the resulting clusters are well-balanced. The difference between the number of points in the largest and in the smallest cluster is similar in the two solutions.
For k=4, we observe fewer data points with negative silhouettes. While, for k=14, more points have a negative silhouette score, especially those in the smaller-sized clusters.
We will use k=4 because a smaller number of clusters makes their understanding simpler and more effective, and according to Occam's razor, the simplest solution is the best one.
We initialize the centroids with the final centroids computed by BisectingKMeans.
chosen_k = 4
bisect_kmeans = BisectingKMeans(n_clusters=chosen_k, n_init=5, init=INIT_METHOD, random_state=RANDOM_STATE).fit(X)
kmeans_params = {}
kmeans_params['random_state'] = RANDOM_STATE
kmeans_params['max_iter'] = MAX_ITER
kmeans_params['n_clusters'] = chosen_k
kmeans_params['n_init'] = 1
kmeans_params['init'] = bisect_kmeans.cluster_centers_
final_result = fit_kmeans(X=X, params=kmeans_params)
kmeans = final_result['model']
clusters = kmeans.labels_
incidents_df['cluster'] = clusters
centroids = kmeans.cluster_centers_
Fitting KMeans with k = 4
pd.DataFrame(final_result, index=['k=4']).drop(columns=['model'])
| SSE | BSS | davies_bouldin_score | calinski_harabasz_score | silhouette_score | n_iter | |
|---|---|---|---|---|---|---|
| k=4 | 18305.347879 | 19658.069367 | 1.203178 | 47182.621916 | 0.323521 | 5 |
We visualize the centroids with a parallel coordinates plot:
for j in range(0, len(centroids)):
plt.plot(centroids[j], marker='o', label='Cluster %s' % j, c=sns.color_palette('tab10')[j])
plt.tick_params(axis='both', which='major', labelsize=10)
plt.xticks(range(0, len(clustered_components)), clustered_components, rotation=90)
plt.legend(fontsize=10)
plt.title(f'Centroids of {chosen_k}-means clusters');
We observe that, as expected, the first components are the ones that most characterize the centroids of the clusters. Diversity decreases as moving through the principal components.
We remind that the 3 principal components were correlated to the following features:
For this reason, we expect that the clusters primarily differ in these 4 features.
We visualize the same information using a interactive radar plot:
def plot_spider(points, features, title=None, palette=sns.color_palette(), save=False):
df = pd.DataFrame()
for i, center in enumerate(points):
tmp_df = pd.DataFrame(dict(r=center, theta=features))
tmp_df['Centroid'] = f'Centroid {i}'
df = pd.concat([df,tmp_df], axis=0)
with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
fig = px.line_polar(df, r='r', theta='theta', line_close=True, color='Centroid', color_discrete_sequence=palette.as_hex())
fig.update_layout(title=title)
fig.show()
if save:
pyo.plot(fig, filename=f'../html/centroids_spider_PCA.html', auto_open=False)
plot_spider(points=centroids, features=clustered_components, title=f'Centroids of {chosen_k}-means clusters', palette=sns.color_palette('tab10'))
We define a function to convert back the centroid in the original feature space:
def inverse_trasform_k_comp(x_pca, pca, n_comp):
return np.matmul(x_pca[:,:n_comp], pca.components_[:n_comp,:]) + pca.mean_
We plot again the centroids in the original feature space:
transformed_centroids = inverse_trasform_k_comp(centroids, pca, 10)
plot_spider(points=transformed_centroids, features=features_to_cluster_no_coord, title=f'Centroids of {chosen_k}-means clusters', palette=sns.color_palette('tab10'), save=True)
for j in range(0, len(centroids)):
plt.plot(transformed_centroids[j], marker='o', label='Cluster %s' % j, c=sns.color_palette('tab10')[j])
plt.tick_params(axis='both', which='major', labelsize=10)
plt.xticks(range(0, len(features_to_cluster_no_coord)), features_to_cluster_no_coord, rotation=90)
plt.legend(fontsize=10)
plt.title(f'Centroids of {k}-means clusters');
We observe that the negative values of the attribute n_arrested_prop (that should always be positive) is due to the reconstruction error.
Clusters differ also in the following features:
Cluster 0 groups incidents with an high number of unharmed people. Cluster 1 groups incidents with an high number of killed people. Cluster 2 groups incidents with an high number of injured people (and a slightly higher-than-average value of number of teens). Cluster 3 groups incidents with an high number of arrested people and a low number of unharmed people, killed people and injured people.
We also visualize the values of the principal components in the original feature space:
plot_spider(points=pca.components_[:10], features=features_to_cluster_no_coord, title=f'Centroids of {chosen_k}-means clusters', palette=sns.color_palette('tab10'))
for j in range(0, len(pca.components_[:10])):
plt.plot(pca.components_[j], marker='o', label='Cluster %s' % j, c=sns.color_palette('tab10')[j])
plt.tick_params(axis='both', which='major', labelsize=10)
plt.xticks(range(0, len(features_to_cluster_no_coord)), features_to_cluster_no_coord, rotation=90)
plt.title(f'Centroids of {chosen_k}-means clusters');
most_identifying_columns = [
'avg_age',
'n_killed_prop',
'n_injured_prop',
'n_unharmed_prop',
'n_males_prop',
'surprisal_n_males',
'n_arrested_prop',
'surprisal_day'
]
We plot on a map the points in each cluster:
plot_scattermap_plotly(
incidents_df,
'cluster',
zoom=2.5,
height=600,
title=f'Point divided by cluster',
black_nan=False
).show()
Incidents are not clustered according to their geographical location. All clusters are evenly distributed, even in areas with fewer incidents like Hawaii or Alaska.
Now we inspect the distribution of categorical features within the clusters:
plot_bars_by_cluster(df=incidents_df, feature='year', cluster_column='cluster')
For clusters 1 and 2, the distribution of the year of the incident is in line with that of the entire dataset. Cluster 3 groups fewer incidents happened in 2014, while cluster 0 groups more incidents happened in 2014.
incidents_df.loc[incidents_df['year']==2014.0].describe()[features_to_cluster_no_coord]
| location_imp | surprisal_address_type | age_range | avg_age | surprisal_min_age | n_child_prop | n_teen_prop | surprisal_age_groups | n_killed_prop | n_injured_prop | n_unharmed_prop | n_males_prop | surprisal_n_males | surprisal_characteristics | n_arrested_prop | n_participants | surprisal_day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.587400e+04 | 25874.000000 | 25874.000000 | 25874.000000 | 25874.000000 | 25874.000000 | 25874.000000 | 25874.000000 | 25874.000000 | 25874.000000 | 25874.000000 | 25874.000000 | 25874.000000 | 25874.000000 | 25874.000000 | 25874.000000 | 25874.000000 |
| mean | 1.538016e-02 | 1.448300 | 3.915939 | 30.295045 | 4.660649 | 0.013315 | 0.070726 | 2.457913 | 0.234085 | 0.289425 | 0.387231 | 0.853493 | 1.979149 | 3.705849 | 0.066539 | 1.965564 | 5.486566 |
| std | 4.998616e-02 | 1.434740 | 8.828470 | 12.790472 | 1.101723 | 0.100046 | 0.235341 | 1.487953 | 0.367183 | 0.390405 | 0.403275 | 0.261810 | 1.221860 | 1.626836 | 0.214640 | 1.160093 | 1.160186 |
| min | 1.000000e-07 | -0.000000 | 0.000000 | 1.000000 | -0.000000 | 0.000000 | 0.000000 | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.000000 | -0.000000 | 0.000000 | 1.000000 | -0.000000 |
| 25% | 1.000000e-07 | 0.444785 | 0.000000 | 21.000000 | 3.906891 | 0.000000 | 0.000000 | 1.415037 | 0.000000 | 0.000000 | 0.000000 | 0.666667 | 1.133267 | 2.459432 | 0.000000 | 1.000000 | 4.718818 |
| 50% | 1.000000e-07 | 0.777608 | 0.000000 | 27.000000 | 4.554589 | 0.000000 | 0.000000 | 1.968291 | 0.000000 | 0.000000 | 0.400000 | 1.000000 | 1.624491 | 3.565854 | 0.000000 | 2.000000 | 5.523562 |
| 75% | 1.000000e-05 | 2.192645 | 3.000000 | 36.000000 | 5.321928 | 0.000000 | 0.000000 | 3.111508 | 0.500000 | 0.500000 | 0.666667 | 1.000000 | 2.415037 | 4.906891 | 0.000000 | 2.000000 | 6.285402 |
| max | 5.584028e-01 | 8.682995 | 79.000000 | 101.000000 | 9.271463 | 1.000000 | 1.000000 | 9.271463 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 8.682995 | 9.271463 | 1.000000 | 20.000000 | 9.271463 |
incidents_df.loc[incidents_df['year']>2014.0].describe()[features_to_cluster_no_coord]
| location_imp | surprisal_address_type | age_range | avg_age | surprisal_min_age | n_child_prop | n_teen_prop | surprisal_age_groups | n_killed_prop | n_injured_prop | n_unharmed_prop | n_males_prop | surprisal_n_males | surprisal_characteristics | n_arrested_prop | n_participants | surprisal_day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.057540e+05 | 105754.000000 | 105754.000000 | 105754.000000 | 105754.000000 | 105754.000000 | 105754.000000 | 105754.000000 | 105754.000000 | 105754.000000 | 105754.000000 | 105754.000000 | 105754.000000 | 105754.000000 | 105754.000000 | 105754.000000 | 105754.000000 |
| mean | 1.317844e-02 | 1.397060 | 3.323534 | 30.181024 | 4.968146 | 0.011320 | 0.076411 | 2.314066 | 0.217208 | 0.302138 | 0.102766 | 0.878613 | 1.825876 | 3.795250 | 0.384326 | 1.774571 | 5.781167 |
| std | 4.343735e-02 | 1.478241 | 7.925659 | 12.230416 | 1.073769 | 0.095811 | 0.247185 | 1.593001 | 0.358518 | 0.395956 | 0.230398 | 0.255843 | 1.285837 | 1.672906 | 0.421839 | 1.074995 | 1.125654 |
| min | 1.000000e-07 | -0.000000 | 0.000000 | 0.000000 | -0.000000 | 0.000000 | 0.000000 | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.000000 | -0.000000 | 0.000000 | 1.000000 | -0.000000 |
| 25% | 1.000000e-07 | 0.395929 | 0.000000 | 21.000000 | 4.224966 | 0.000000 | 0.000000 | 1.180572 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.925999 | 2.478047 | 0.000000 | 1.000000 | 5.044394 |
| 50% | 1.000000e-05 | 0.660514 | 0.000000 | 27.000000 | 4.857981 | 0.000000 | 0.000000 | 1.766544 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.306661 | 3.700440 | 0.250000 | 2.000000 | 5.807355 |
| 75% | 1.000000e-05 | 2.137504 | 2.000000 | 36.000000 | 5.614710 | 0.000000 | 0.000000 | 2.938599 | 0.500000 | 0.500000 | 0.000000 | 1.000000 | 2.297681 | 5.003752 | 1.000000 | 2.000000 | 6.544321 |
| max | 6.467426e-01 | 9.489848 | 82.000000 | 101.000000 | 9.614710 | 1.000000 | 1.000000 | 9.614710 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 9.489848 | 9.614710 | 1.000000 | 63.000000 | 9.614710 |
| year | n_participants mean | n_arrested mean/ratio | n_unharmed mean/ratio |
|---|---|---|---|
| =2014 | 1.965 | 0.066 / 0.033 | 0.387 / 0.196 |
| >2014 | 1.774 | 0.384 / 0.216 | 0.102 / 0.001 |
In 2014, the average number of arrests is 0.06 and the proportion between the average number of arrestees and participants is 0.03, whereas in subsequent years the two values are higher (0.38 and 0.21 respectively).
plot_bars_by_cluster(df=incidents_df, feature='party', cluster_column='cluster')
In cluster 3 - characterized by the highest value of n_arrestes_prop - the proportion of incidents happened in Republican states is higher than those happened in Democratic states. This is probably due to variations in the law enforcement policies.
plot_bars_by_cluster(df=incidents_df, feature='firearm', cluster_column='cluster')
dummy_df = incidents_df[
(incidents_df['firearm']==False)]['cluster'].value_counts().to_frame()
dummy_df['percentage'] = dummy_df['count']/sum(dummy_df['count'])
dummy_df
| count | percentage | |
|---|---|---|
| cluster | ||
| 0 | 946 | 0.758013 |
| 3 | 171 | 0.137019 |
| 2 | 77 | 0.061699 |
| 1 | 54 | 0.043269 |
Almost all incidents that did not involve firearms belong the cluster with the highest number of unharmed.
plot_bars_by_cluster(df=incidents_df, feature='shots', cluster_column='cluster')
The vast majority of incidents within clusters 1 and 2 are shooting incidents and are the incidents that mostly caused deaths and injuries. Cluster 3 groups less shooting incidents.
plot_bars_by_cluster(df=incidents_df, feature='aggression', cluster_column='cluster')
As expected, cluster 3 mainly comprises non-aggressive incidents, while cluster 2, almost entirely comprises aggressive incidents. The distribution of aggressions in cluster 0 is similar to the distribution in the whole dataset. Cluster 1, identified with the highest number of killed people, exhibits the largest difference between the number of aggressive and non-aggression incidents.
incidents_df.groupby('aggression').describe()[['n_killed_prop', 'n_injured_prop', 'n_arrested_prop']]
| n_killed_prop | n_injured_prop | n_arrested_prop | ||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| aggression | ||||||||||||||||||||||||
| False | 58449.0 | 0.415296 | 0.428928 | 0.0 | 0.0 | 0.333333 | 1.0 | 1.0 | 58449.0 | 0.019555 | 0.119545 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 58449.0 | 0.430669 | 0.446516 | 0.0 | 0.0 | 0.5 | 1.0 | 1.0 |
| True | 73362.0 | 0.065454 | 0.179670 | 0.0 | 0.0 | 0.000000 | 0.0 | 1.0 | 73362.0 | 0.523447 | 0.395273 | 0.0 | 0.0 | 0.5 | 1.0 | 1.0 | 73362.0 | 0.234706 | 0.354490 | 0.0 | 0.0 | 0.0 | 0.5 | 1.0 |
The average number of deaths increases to 0.415 for aggressive incidents and drops to 0.065 for non aggressive incidents.
plot_bars_by_cluster(df=incidents_df, feature='suicide', cluster_column='cluster')
dummy_df = incidents_df[
(incidents_df['suicide']==True)]['cluster'].value_counts().to_frame()
dummy_df['percentage'] = dummy_df['count']/sum(dummy_df['count'])
dummy_df
| count | percentage | |
|---|---|---|
| cluster | ||
| 1 | 3350 | 0.925414 |
| 2 | 133 | 0.036740 |
| 0 | 70 | 0.019337 |
| 3 | 67 | 0.018508 |
Suicides are almost entirely contained within cluster 1.
plot_bars_by_cluster(df=incidents_df, feature='injuries', cluster_column='cluster')
dummy_df = incidents_df[
(incidents_df['suicide']==True)]['cluster'].value_counts().to_frame()
dummy_df['percentage'] = dummy_df['count']/sum(dummy_df['count'])
dummy_df
| count | percentage | |
|---|---|---|
| cluster | ||
| 1 | 3350 | 0.925414 |
| 2 | 133 | 0.036740 |
| 0 | 70 | 0.019337 |
| 3 | 67 | 0.018508 |
Cluster 2 mainly groups incidents involving injuries. Cluster 1 presents a lower number of incidents involving injuries. Half of the incidents not involving injuries are in cluster 3, the other half are evenly distributed between clusters 0 and 1.
plot_bars_by_cluster(df=incidents_df, feature='death', cluster_column='cluster')
dummy_df = incidents_df[
(incidents_df['suicide']==True)]['cluster'].value_counts().to_frame()
dummy_df['percentage'] = dummy_df['count']/sum(dummy_df['count'])
dummy_df
| count | percentage | |
|---|---|---|
| cluster | ||
| 1 | 3350 | 0.925414 |
| 2 | 133 | 0.036740 |
| 0 | 70 | 0.019337 |
| 3 | 67 | 0.018508 |
Cluster 1 mainly groups mortal incidents. Some mortal incidents are also in cluster 2.
plot_bars_by_cluster(df=incidents_df, feature='illegal_holding', cluster_column='cluster')
dummy_df = incidents_df[
(incidents_df['suicide']==True)]['cluster'].value_counts().to_frame()
dummy_df['percentage'] = dummy_df['count']/sum(dummy_df['count'])
dummy_df
| count | percentage | |
|---|---|---|
| cluster | ||
| 1 | 3350 | 0.925414 |
| 2 | 133 | 0.036740 |
| 0 | 70 | 0.019337 |
| 3 | 67 | 0.018508 |
In clusters 1 and 2, few incidents are tagged as 'illegal_holding'. Incidents exhibiting this tag are mainly grouped in clusters 0 and 3 (most of them did not result in injuries or deaths).
plot_bars_by_cluster(df=incidents_df, feature='house', cluster_column='cluster')
dummy_df = incidents_df[
(incidents_df['suicide']==True)]['cluster'].value_counts().to_frame()
dummy_df['percentage'] = dummy_df['count']/sum(dummy_df['count'])
dummy_df
| count | percentage | |
|---|---|---|
| cluster | ||
| 1 | 3350 | 0.925414 |
| 2 | 133 | 0.036740 |
| 0 | 70 | 0.019337 |
| 3 | 67 | 0.018508 |
plot_bars_by_cluster(df=incidents_df, feature='school', cluster_column='cluster')
dummy_df = incidents_df[
(incidents_df['suicide']==True)]['cluster'].value_counts().to_frame()
dummy_df['percentage'] = dummy_df['count']/sum(dummy_df['count'])
dummy_df
| count | percentage | |
|---|---|---|
| cluster | ||
| 1 | 3350 | 0.925414 |
| 2 | 133 | 0.036740 |
| 0 | 70 | 0.019337 |
| 3 | 67 | 0.018508 |
We observe that incidents that occurred in schools have, on average, resulted in many arrest. The majority of them involve few participants and a peaceful resolution of the event.
plot_bars_by_cluster(df=incidents_df, feature='drugs', cluster_column='cluster')
dummy_df = incidents_df[
(incidents_df['suicide']==True)]['cluster'].value_counts().to_frame()
dummy_df['percentage'] = dummy_df['count']/sum(dummy_df['count'])
dummy_df
| count | percentage | |
|---|---|---|
| cluster | ||
| 1 | 3350 | 0.925414 |
| 2 | 133 | 0.036740 |
| 0 | 70 | 0.019337 |
| 3 | 67 | 0.018508 |
Cluster 3 groups most of the incidents involving drugs. This means that these incidents often have peaceful resolutions and frequently involve arrests.
plot_bars_by_cluster(df=incidents_df, feature='officers', cluster_column='cluster')
Incidents involving officers exhibit higher numbers of arrests.
plot_bars_by_cluster(df=incidents_df, feature='workplace', cluster_column='cluster')
Incidents happened at workplace are mainly grouped in clusters 0 and 3. Generally, these incidents conclude peacefully and often result in arrests.
plot_bars_by_cluster(df=incidents_df, feature='defensive', cluster_column='cluster')
dummy_df = incidents_df[
(incidents_df['suicide']==True)]['cluster'].value_counts().to_frame()
dummy_df['percentage'] = dummy_df['count']/sum(dummy_df['count'])
dummy_df
| count | percentage | |
|---|---|---|
| cluster | ||
| 1 | 3350 | 0.925414 |
| 2 | 133 | 0.036740 |
| 0 | 70 | 0.019337 |
| 3 | 67 | 0.018508 |
The majority of incidents identified as 'defensive' are included in cluster 0 (the cluster with higher numbers of unharmed).
We visualize the identified clusters projected onto the most influential dimensions in the clustering, i.e.:
scatter_by_cluster(
df=incidents_df,
features=most_identifying_columns,
cluster_column='cluster',
centroids=transformed_centroids[:, [most_identifying_columns.index(feature) for feature in most_identifying_columns]],
figsize=(15, 20),
ncols=3,
color_palette=sns.color_palette('tab10')
)
plt.tight_layout()
We visualize the clusters in the space of the first 6 components of PCA. As expected the first principal components better separate the 4 clusters.
palette = [sns.color_palette('tab10')[i] for i in range(chosen_k)]
scatter_pca_features_by_cluster(
X_pca=X_pca,
n_components=6,
clusters=incidents_df['cluster'],
palette=palette,
hue_order=None,
title='Clusters in PCA space'
)
We plot the distributions of the features used for clustering.
for feature in features_to_cluster:
plot_hists_by_cluster(
df=incidents_df,
feature=feature,
cluster_column='cluster',
title=f'Distribution of {feature} in each cluster',
color_palette=sns.color_palette('tab10'),
figsize=(30, 5)
)
These plots confirm the observations made so far. We observe that in cluster 2, the value of 'suprisal_charactestic' is lower than in the other clusters.
We compute the sum of squared error for each separate feature:
sse_feature = []
for i in range(X.shape[1]):
sse_feature.append(compute_se_per_point(X=X[:,i], clusters=clusters, centroids=transformed_centroids[:,i]).sum())
plt.figure(figsize=(15, 5))
sse_feature_sorted, clustering_features_sorted = zip(*sorted(zip(sse_feature, clustered_components)))
plt.bar(range(len(sse_feature_sorted)), sse_feature_sorted)
plt.xticks(range(len(sse_feature_sorted)), clustering_features_sorted)
plt.xticks(rotation=90)
plt.ylabel('SSE')
plt.xlabel('Feature')
plt.title('SSE per feature')
Text(0.5, 1.0, 'SSE per feature')
We compute and plot the silhouette score for each point:
fig, axs = plt.subplots(1, figsize=(8,5))
silhouette_per_point = silhouette_samples(X=X, labels=clusters)
plot_scores_per_point(
score_per_point=silhouette_per_point,
clusters=incidents_df['cluster'],
score_name='Silhouette score', ax=axs,
title=f'Silhouette score for {k}-Means clustering',
color_palette=sns.color_palette('tab10'),
minx=-0.02
)
As already observed, few points have a negative silhouette score. Cluster 0 has lower silhouette scores than the other clusters.
# print top 5 points with highest SSE
se_per_point = compute_se_per_point(X=X, clusters=clusters, centroids=centroids)
indices_of_top_contributors = np.argsort(se_per_point)[-5:]
incidents_df.iloc[indices_of_top_contributors]
| date | date_original | year | month | day | day_of_week | state | address | latitude | longitude | county | city | address_type | congd | state_house_district | state_senate_district | px_code | participant_age1 | participant1_child | participant1_teen | participant1_adult | participant1_male | participant1_female | min_age | max_age | n_child | n_teen | n_adult | n_males | n_females | n_killed | n_injured | n_arrested | n_unharmed | notes | incident_characteristics1 | incident_characteristics2 | firearm | air_gun | shots | aggression | suicide | injuries | death | road | illegal_holding | house | school | children | drugs | officers | organized | social_reasons | defensive | workplace | abduction | unintentional | poverty_perc | party | candidate_votes | total_votes | candidate_perc | population_state_2010 | lat_proj | lon_proj | location_imp | surprisal_address_type | age_range | avg_age | surprisal_min_age | log_min_age_mean_ratio | n_child_prop | n_teen_prop | surprisal_age_groups | severity | n_killed_prop | surprisal_n_killed | log_n_killed_mean_ratio | n_injured_prop | log_n_injured_mean_ratio | surprisal_n_injured | n_unharmed_prop | n_males_prop | log_n_males_mean_ratio | surprisal_n_males | surprisal_characteristics | n_arrested_prop | log_n_participants_mean_ratio | surprisal_n_participants | n_participants | surprisal_day | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 116805 | 2014-07-19 | 2014-07-19 | 2014.0 | 7 | 19 | 5 | CALIFORNIA | 12th and Filbert | 37.8074 | -122.2830 | Alameda County | Oakland | amenity | 13.0 | 18.0 | 9.0 | CA | 10.0 | True | False | False | False | True | 10.0 | 10.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | NaN | Shot - Wounded/Injured | NaN | True | False | True | True | False | True | False | False | False | False | False | True | False | False | False | False | False | False | False | False | 14.5 | DEMOCRAT | 168491.0 | 190431.0 | 88.478767 | 37253956 | 563115.378637 | 4.184688e+06 | 0.000010 | 3.543543 | 0.0 | 10.0 | 9.271463 | -1.038644 | 1.0 | 0.0 | 8.271463 | 0.3 | 0.0 | 0.104045 | 0.003328 | 1.0 | 2.192754 | 3.464108 | 0.0 | 0.0 | 0.002256 | 6.464108 | 9.271463 | 0.0 | -0.644059 | 2.913911 | 1.0 | 9.271463 | 2 |
| 185844 | 2015-01-01 | 2015-01-01 | 2015.0 | 1 | 1 | 3 | WISCONSIN | 2700 block of N. Holton St | 43.0675 | -87.9053 | Milwaukee County | Milwaukee | tourism | 4.0 | 16.0 | 6.0 | WI | 15.0 | False | True | False | False | True | 15.0 | 15.0 | 0.0 | 2.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | Teen seriously wounded after New Years shooting. | Shot - Wounded/Injured | NaN | True | False | True | True | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | 10.5 | DEMOCRAT | 179045.0 | 254892.0 | 70.243476 | 5686986 | 426291.101979 | 4.768708e+06 | 0.000010 | 7.539159 | 0.0 | 15.0 | 4.731804 | -0.546545 | 0.0 | 1.0 | 6.539159 | 0.3 | 0.0 | 0.516791 | 0.003328 | 0.5 | 0.191550 | 0.909802 | 0.5 | 0.0 | 0.002256 | 4.217231 | 1.349334 | 0.0 | 0.176110 | 1.954196 | 2.0 | 7.539159 | 0 |
| 109349 | 2014-04-30 | 2014-04-30 | 2014.0 | 4 | 30 | 2 | SOUTH CAROLINA | NaN | 32.2860 | -80.7214 | Beaufort County | NaN | county | 1.0 | 123.0 | 46.0 | SC | 12.0 | False | True | False | False | True | 12.0 | 12.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | NaN | Shot - Dead (murder, accidental, suicide) | Suicide^ | True | False | True | False | True | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | 14.9 | REPUBLICAN | 119392.0 | 127815.0 | 93.410007 | 4625364 | 526233.291331 | 3.572171e+06 | 0.475885 | 6.303781 | 0.0 | 12.0 | 6.303781 | -0.761491 | 0.0 | 1.0 | 5.303781 | 0.7 | 1.0 | 2.303781 | 1.597534 | 0.0 | 0.003328 | 0.373043 | 0.0 | 0.0 | 0.002256 | 6.303781 | 6.303781 | 0.0 | -0.785174 | 2.216318 | 1.0 | 6.303781 | 1 |
| 102815 | 2017-01-14 | 2017-01-14 | 2017.0 | 1 | 14 | 5 | LOUISIANA | 2200 block of Westbend Parkway | 29.9297 | -90.0196 | Orleans Parish | New Orleans | leisure | 2.0 | 102.0 | 7.0 | LA | 0.0 | True | False | False | False | True | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 inj | Shot - Wounded/Injured | Child Involved Incident | True | False | True | True | False | True | False | False | False | False | False | True | False | False | False | False | False | False | False | False | 19.1 | DEMOCRAT | 198289.0 | 284269.0 | 69.754001 | 4533372 | 787720.693084 | 3.314731e+06 | 0.000010 | 9.129283 | 0.0 | 0.0 | 9.129283 | 0.000259 | 1.0 | 0.0 | 8.129283 | 0.3 | 0.0 | 0.371060 | 0.003328 | 1.0 | 0.384787 | 1.057821 | 0.0 | 0.0 | 0.002256 | 4.374396 | 6.807355 | 0.0 | -0.511643 | 0.830075 | 1.0 | 6.807355 | 2 |
| 214525 | 2017-01-18 | 2017-01-18 | 2017.0 | 1 | 18 | 2 | LOUISIANA | 5100 Oaklon Avenue | 30.5087 | -91.1414 | East Baton Rouge Parish | Baton Rouge | tourism | 2.0 | 29.0 | 14.0 | LA | 15.0 | False | True | False | False | True | 15.0 | 15.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1 killed | Shot - Dead (murder, accidental, suicide) | NaN | True | False | True | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | 19.1 | DEMOCRAT | 198289.0 | 284269.0 | 69.754001 | 4533372 | 678351.631724 | 3.376625e+06 | 0.000010 | 9.129283 | 0.0 | 15.0 | 6.807355 | -0.658834 | 0.0 | 1.0 | 5.129283 | 0.7 | 1.0 | 2.271302 | 1.365943 | 0.0 | 0.003328 | 1.204471 | 0.0 | 0.0 | 0.002256 | 4.374396 | 2.843881 | 0.0 | -0.511643 | 0.830075 | 1.0 | 7.129283 | 1 |
All these points have an high number of participants.
The number of participants contributes a lot to the SSE.
To evaluate the separation we also display an embedding of the cluster centers in 2 dimensions, using the implementation of Yellowbrick:
visualizer = InterclusterDistance(kmeans)
visualizer.fit(X)
visualizer.show();
c:\Users\lucam\anaconda3\envs\python3115\Lib\site-packages\sklearn\manifold\_mds.py:298: FutureWarning: The default value of `normalized_stress` will change to `'auto'` in version 1.4. To suppress this warning, manually set the value of `normalized_stress`.
Clusters are well separated.
We now compute cohesion (SSE) and separation (BSS) for each cluster and visualize it:
# compute cohesion for each cluster
se_per_cluster = np.zeros(chosen_k)
sizes = np.ones(centroids.shape[0])
for i in range(chosen_k):
se_per_cluster[i] = np.sum(se_per_point[np.where(clusters == i)[0]])/sizes[i]
# compute separation for each cluster
bss_per_cluster = compute_bss_per_cluster(X, clusters, centroids, weighted=True)
# compute average silhouette score for each cluster
silhouette_per_cluster = np.zeros(chosen_k)
for i in range(chosen_k):
silhouette_per_cluster[i] = silhouette_per_point[np.where(clusters == i)[0]].mean()
# visualize the result
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(20,5))
axs[0].bar(range(chosen_k), se_per_cluster, color=sns.color_palette('tab10'))
axs[0].set_ylim(8000, 0)
axs[0].set_title('Cohesion')
axs[0].set_ylabel('SSE')
axs[1].bar(range(chosen_k), bss_per_cluster, color=sns.color_palette('tab10'))
axs[1].set_title('Separation')
axs[1].set_ylabel('BSS')
axs[2].bar(range(chosen_k), silhouette_per_cluster, color=sns.color_palette('tab10'))
axs[2].set_title('Silhouette')
axs[2].set_ylabel('Silhouette score')
for i in range(3):
axs[i].set_xlabel('Cluster')
axs[i].set_xticks(range(chosen_k))
axs[i].set_xticklabels(range(chosen_k))
plt.suptitle('Cohesion and separation measures for each cluster', fontweight='bold')
Text(0.5, 0.98, 'Cohesion and separation measures for each cluster')
Cluster 1 is the less cohesive, cluster 2 is the best separated.
We visualize the distance matrix sorted by cluster computed on a stratified subsample of 5000 points:
dm, idm = plot_distance_matrices(X=X, n_samples=5000, clusters=clusters, random_state=RANDOM_STATE)
The pearson correlation coefficient between the two matrix is 0.64. Indeed, the matrix has a sharp block diagonal structure, meaning that clusters are well separated.
We measure the extent to which the discovered clustering structure matches some categorical features of the dataset, using the following permutation invariant scores:
incidents_df['unharmed'] = incidents_df['n_unharmed'] > 0
incidents_df['arrested'] = incidents_df['n_arrested'] > 0
incidents_df['males'] = incidents_df['n_males'] > 0
incidents_df['females'] = incidents_df['n_females'] > 0
external_score = compute_permutation_invariant_external_metrics(
incidents_df,
'cluster',
['shots', 'aggression', 'suicide', 'injuries', 'death', 'drugs', 'illegal_holding', 'unharmed', 'arrested', 'males', 'females']
)
external_score
| adjusted rand score | normalized mutual information | homogeneity | completeness | |
|---|---|---|---|---|
| feature | ||||
| shots | 0.013098 | 0.135908 | 0.235928 | 0.095445 |
| aggression | 0.119408 | 0.174358 | 0.257281 | 0.131859 |
| suicide | 0.018048 | 0.055002 | 0.320398 | 0.030083 |
| injuries | 0.196766 | 0.252092 | 0.373973 | 0.190128 |
| death | 0.167238 | 0.260939 | 0.409001 | 0.191583 |
| drugs | -0.021250 | 0.051653 | 0.149292 | 0.031229 |
| illegal_holding | -0.012607 | 0.040955 | 0.101807 | 0.025633 |
| unharmed | 0.339517 | 0.501209 | 0.841450 | 0.356898 |
| arrested | 0.453665 | 0.514149 | 0.761589 | 0.388066 |
| males | 0.006576 | 0.008411 | 0.036753 | 0.004749 |
| females | 0.005327 | 0.002778 | 0.005151 | 0.001902 |
The categories that best matche the clustering are 'unharmed' and 'arrested'. The most homogeneous category is 'unharmed'. However, completeness is quite low for all the categories.
incidents_df['cluster'].to_csv(f'../data/clustering_labels/{chosen_k}-Means_PCA_clusters.csv')
external_score.to_csv(f'../data/clustering_labels/{chosen_k}-Means_PCA_external_scores.csv')
pd.DataFrame(final_result, index=['4means PCA']).T.to_csv(f'../data/clustering_labels/{chosen_k}-Means_PCA_internal_scores.csv')